The aim of this markdown document is to briefly summarise breeding success data, but mainly to explore the potential environmental variables that I’ve selected for now.
The breeding traits selected as response are first egg lay date, clutch size and fledgeling success. Here I show both their distribution and their raw data (with noise).
First, let’s take a look at sample sizes per age (both age and age at last reproduction attempt):
birds_per_age <- blutidf %>% count(yo)
birds_per_age_3yo <- blutidf_3yo %>% count(yo)
plot1 <- ggplot(blutidf, aes(x=yo)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,7,1), limits=c(1,7)) +
scale_y_continuous(limits=c(0,820)) +
labs(x = "Age", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
blutidf_3yo_fed <- blutidf_3yo %>% filter(!is.na(fed))
birds_per_age_3yo_fed <- blutidf_3yo_fed %>% count(yo)
birds_per_age_3yo_fed_w <- blutidf_3yo_fed %>% count(w)
birds_per_age_3yo_fed # here we can see the number of observations per age group
## yo n
## 1 1 581
## 2 2 481
## 3 3 159
birds_per_age_3yo_fed_w # and here we can see the number of observations per age-at-last-reproduction group
## w n
## 1 1 410
## 2 2 391
## 3 3 229
## 4 4 111
## 5 5 60
## 6 6 18
## 7 7 2
plot1 <- ggplot(blutidf_3yo_fed, aes(x=yo)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_fed$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,3,1), limits=c(1,3)) +
scale_y_continuous(limits=c(0,700)) +
labs(x = "Age", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age (yo) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
plot2 <- ggplot(blutidf_3yo_fed, aes(x=w)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_fed_w$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,7,1), limits=c(1,7)) +
scale_y_continuous(limits=c(0,500)) +
labs(x = "Age at last reproduction (w)", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age-at-last-reproduction (w) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
grid.arrange(plot1, plot2, ncol=2, nrow = 1)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
Next, let’s take a look at the histogram for fed:
ggplot(blutidf_3yo, aes(fed)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency") +
theme_bw() +
scale_x_continuous(breaks=seq(90,200,5)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_bin()`).
At first glance, although it looks a bit shifted, first egg lay date appears to be more or less Normally distributed:
qqnorm(blutidf_3yo$fed); qqline(blutidf_3yo$fed, col = 2)
The observations mostly align with the diagonal that crosses through the first and third quantiles of the theoretical Normal distribution, except for the tails of the data distribution.
Once we’ve looked at the trait distribution overall, we can take a look at how this distribution changes when we plot observations both by age and by age at last reproductive attempt:
ggplot(blutidf_3yo, aes(fed)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "First egg lay date per age (yo) group") +
theme_bw() +
facet_wrap(~ yo) +
scale_x_continuous(breaks=seq(90,200,5)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(blutidf_3yo, aes(fed)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "First egg lay date per age-at-last-reproduction (w) group") +
theme_bw() +
facet_wrap(~ w) +
scale_x_continuous(breaks=seq(90,200,5)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_bin()`).
find_mode <- function(x) {
u <- unique(x)
tab <- tabulate(match(x, u))
u[tab == max(tab)]
} # using a function that will calculate the mode for each of the plotted distributions
sd_fed <- sd(na.omit(blutidf_3yo$fed)) # estimating standard deviation
se_1 <- sd_fed/sqrt(length(na.omit(blutidf_3yo$fed))) # estimating SE of the mean omitting rows (cases) with NA values in variable fed and interpreting number of cases as sample size, not unique individuals
se_2 <- sd_fed/sqrt(length(unique(na.omit(blutidf_3yo$ring)))) # estimating SE of the mean omitting rows (cases) with NA values in variable fed and interpreting number of unique individuals as sample size, not overall cases
# I would like to highlight here that I'm not entirely sure which SE is the correct one, but since what I'm plotting is the breeding trait value associated with each observation and not with each unique bird (unique birds could have several reproductive values if they've been recorded more than once), and I'm plotting mean value of all observations and not of all unique birds, I would say that, for the purpose of plotting raw data overlapped by mean +- SE, the correct way to go would be the first formula (se_1), assuming that I'm not wrong about the formula of the SE of the mean.
se <- function(x) {
sd(x)/(sqrt(length(x)))
} # this rudimentary function should be able to calculate the SE based on the first formula I presented...
se(na.omit(blutidf_3yo$fed)) # ...as shown here
## [1] 0.21248
blutidf_3yo %>% group_by(yo) %>% na.omit(blutidf_3yo$fed) %>% summarise_at(vars(fed), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max)) # summarising the data to mean, standard deviation, mode, min. value and max. value
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the dplyr package.
## Please report the issue at <https://github.com/tidyverse/dplyr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 4 × 7
## # Groups: yo [3]
## yo mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 122. 7.73 0.353 122 100 146
## 2 2 121. 7.62 0.384 125 100 148
## 3 2 121. 7.62 0.384 117 100 148
## 4 3 119 6.17 0.539 116 104 141
The main thing that catches one’s attention is the decrease in frequency as age increases. One can also get the impression that the mean increases as age increases, however the decrease between age groups is of barely 1-2 days on average (as we can see on the summary table above). As for the range of values, the distribution appears to be slightly narrower for observations at age 3 (y.o.) (the earlier recorded first egg lay date is 4 days later than for younger birds but the latest is at least 5 days earlier than the last first egg lay date observed in younger birds).
Now, as the warning in the histogram plot claims, there are 297 observations that have been removed:
head(blutidf[which(is.na(blutidf$fed)),])
## X ring year site box fed cs suc suc_prop hatchyear age yo w case
## 69 69 Z632546 2015 BAD 1 NA 8 0 0.0000000 2013 6 2 3 69
## 70 70 Z632558 2015 BAD 2 NA 5 5 1.0000000 2014 5 1 1 70
## 71 71 Z632638 2015 BAD 3 NA 6 3 0.5000000 2014 5 1 4 71
## 72 72 Z632550 2015 BAD 4 NA 8 6 0.7500000 2014 5 1 2 72
## 73 73 Z632551 2015 BAD 5 NA 7 5 0.7142857 2014 5 1 1 73
## 88 88 Z081800 2015 MCH 1 NA 5 5 1.0000000 2014 5 1 1 88
Quite a few of these belong to sites that are visited less often (most sites in the Northern part of the transect, starting with HWP and until DOR, as well as sites in the southern transect that after a certain year started to receive visits less often: BAD, DLW and DUN) and, therefore, estimation of first egg lay date based on the number of eggs present in the nest box once its inspected is less accurate. However, and potentially more relevant, most of these fed-lacking observations belong to the year 2020, year in which the sanitary emergency caused by the COVID-19 pandemic provided an obstacle to conduct fieldwork following the standard protocol. The effect of this event becomes even more clear when plotting raw data vs. year, where we can see the low number of “noisy” data points in year 2020:
bluti_fed_summary <- blutidf_3yo %>%
group_by(year) %>%
na.omit(bluti_fed_summary) %>%
summarise_at(vars(fed), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot() +
geom_jitter(data=blutidf_3yo, aes(x=year, y=fed), size=2, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_fed_summary, aes(x=year, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_fed_summary, aes(x=year, y=mean), colour = "black", size = 2, alpha = 0.8) +
labs(x = "Year", y = "First egg lay date (1st Jan = 1)", title = "First egg lay date raw data and mean+sd") +
theme_bw() +
scale_x_continuous(breaks=seq(2014,2024,1)) +
scale_y_continuous(breaks=seq(90,200,5)) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
## Warning: Removed 297 rows containing missing values or values outside the scale range
## (`geom_point()`).
I would like to point out that I am a bit skeptical about the calculated Standard Errors as they look a bit too narrow for me, so I apologise in advance as I might have made a mistake calculating them. However, I will keep using the formula throughout this document simply to maintain consistency throughout.
Something that I also find quite interesting is how in some years the points appear to be more clustered (2018) while in others years they appear to be more scattered (2020).
ggplot(blutidf_3yo, aes(fed)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency") +
theme_bw() +
facet_wrap(~ year) +
scale_x_continuous(breaks=seq(90,200,5)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_bin()`).
This plot shows that, indeed, fed distribution in 2021
is very wide and low, while it’s much narrower in 2018 (with the
exception of an outlier). Apart from this, another thing that catches my
eye is how flat distributions look like overall (with exceptions f.e.
2019). I am not sure whether this is relevant or not as we won’t study
variation across years (but rather control for it), and whether this is
a Normal-enough behaviour, but I do think that different years show
different traits and looking at population composition per year might
also be interesting:
blutidf_3yo %>%
count(year,yo) %>%
pivot_wider(names_from=year,values_from=c(n))
## # A tibble: 3 × 12
## yo `2014` `2015` `2016` `2017` `2018` `2019` `2020` `2021` `2022` `2023`
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 27 72 33 85 70 97 89 59 65 68
## 2 2 28 31 59 47 70 46 61 78 51 71
## 3 3 NA 5 11 17 23 26 16 24 31 22
## # ℹ 1 more variable: `2024` <int>
The number of observations of 3 years-old individuals per year is quite low overall, and it’s almost always less than 20% of the population (2022 is the sole exception); birds aged 3 years old usually represent between 10 and 15% of the total of the (female) population.
Now, let’s see how this breeding trait changes across sites:
bluti_fed_summary <- blutidf_3yo %>%
group_by(site) %>%
na.omit(bluti_fed_summary) %>%
summarise_at(vars(fed), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
level_order <- c("EDI", "RSY", "FOF", "BAD", "LVN", "DOW", "GLF", "SER", "MCH", "PTH", "STY", "BIR", "DUN", "BLG", "PIT", "KCK", "KCZ", "BLA", "CAL", "DNM", "DNC", "DNS", "DLW", "CRU", "NEW", "HWP", "INS", "FSH", "RTH", "AVI", "AVN", "CAR", "SLS", "TOM", "DAV", "ART", "MUN", "FOU", "ALN", "DEL", "TAI", "SPD", "OSP", "DOR")
ggplot(blutidf_3yo, aes(x=site, y=fed)) +
geom_jitter(size=1.5, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_fed_summary, aes(x=site, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_fed_summary, aes(x=site, y=mean), colour = "black", size = 2) +
labs(x = "Years", y = "First egg lay date (1st Jan = 1)") +
theme_bw() +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 30),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
## Warning: Removed 297 rows containing missing values or values outside the scale range
## (`geom_point()`).
At first glance, there appears to be sites on which, on average, first egg lay date is earlier and where it appears to be later (also on average). On average, first egg lay date lies between day 109 and 131 across sites, with some sites having first egg lay date as early as day ~100 and as late as day ~146. To me, this suggest that there is are important differences between sites (as we initially expected).
blutidf_3yo_cs <- blutidf_3yo %>% filter(!is.na(cs))
birds_per_age_3yo_cs <- blutidf_3yo_cs %>% count(yo)
birds_per_age_3yo_cs_w <- blutidf_3yo_cs %>% count(w)
birds_per_age_3yo_cs # here we can see the number of observations per age group
## yo n
## 1 1 704
## 2 2 575
## 3 3 190
birds_per_age_3yo_cs_w # and here we can see the number of observations per age-at-last-reproduction group
## w n
## 1 1 497
## 2 2 480
## 3 3 276
## 4 4 124
## 5 5 68
## 6 6 21
## 7 7 3
plot1 <- ggplot(blutidf_3yo_cs, aes(x=yo)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_cs$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,3,1), limits=c(1,3)) +
scale_y_continuous(limits=c(0,800)) +
labs(x = "Age", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age (yo) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
plot2 <- ggplot(blutidf_3yo_cs, aes(x=w)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_cs_w$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,7,1), limits=c(1,7)) +
scale_y_continuous(limits=c(0,800)) +
labs(x = "Age at last reproduction (w)", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age-at-last-reproduction (w) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
grid.arrange(plot1, plot2, ncol=2, nrow = 1)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
As with fed, let’s take a look at cs
histogram:
ggplot(blutidf_3yo, aes(cs)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency") +
theme_bw() +
scale_x_continuous(breaks=seq(0,15,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).
The sampling distribution of cs appears to be more
normally distributed than the sampling distribution of fed,
which should make things easier.
qqnorm(blutidf_3yo$cs); qqline(blutidf_3yo$cs, col = 2)
However, the Q-Q plot looks odd. As with
fed, the plot
highlight that the data is count data, for which (if I’m not mistaken)
the best distribution to model it would be a Poisson distribution,
although Sanjana mentioned that this might not be necessary since it’s
been shown that the model outputs usually don’t change as much. Apart
from that, it also looks like the distribution is light-tailed according
to the shape of the Q-Qplot.
Apologies in advance if the interpretation of the Q-Qplots is wrong; I haven’t had many chances to study and interpret them so I’m most likely completely off here.
Again, we can now take a look at how this distribution changes when we plot observations both by age and by age at last reproductive attempt:
ggplot(blutidf_3yo, aes(cs)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age (yo) group") +
theme_bw() +
facet_wrap(~ yo) +
scale_x_continuous(breaks=seq(0,16,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
ggplot(blutidf_3yo, aes(cs)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age-at-last-reproduction (w) group") +
theme_bw() +
facet_wrap(~ w) +
scale_x_continuous(breaks=seq(0,16,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
blutidf_3yo %>% group_by(yo) %>% na.omit(blutidf_3yo$cs) %>% summarise_at(vars(cs), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max)) # summarising the data to mean, standard deviation, mode, min. value and max. value
## # A tibble: 3 × 7
## yo mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 8.10 1.97 0.0899 8 1 14
## 2 2 8.26 1.81 0.0912 8 2 14
## 3 3 8.37 1.88 0.164 8 4 13
When we look at the mean clutch size per age group, we can see that, on average, it increases as age increases: it increases by 0.16 eggs (around 2% increase) from 1 y.o. to 2 y.o and by 0.11 eggs (around 1% increase) from 2 y.o. to 3 y.o. In all cases, the most frequent clutch size is 8 eggs, although the minimum increases as age increases and the maximum seems to decrease at age 3 (from 14 eggs at both 1 y.o. and 2 y.o. to 13 at 3 y.o.).
Let’s summarise by age at last reproductive attempt now:
blutidf_3yo %>% group_by(w) %>% na.omit(blutidf_3yo$cs) %>% summarise_at(vars(cs), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max)) # summarising the data to mean, standard deviation, mode, min. value and max. value
## # A tibble: 6 × 7
## w mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 8.10 1.99 0.107 7 1 14
## 2 2 8.23 1.75 0.0985 8 3 14
## 3 3 8.24 1.93 0.139 9 2 13
## 4 4 8.34 2.05 0.215 8 3 13
## 5 5 8.34 1.68 0.254 7 6 13
## 6 6 7.93 2.09 0.539 8 4 13
The one bird that is believed to be 7 years old is not included in the summary table, and I’m not sure why.
On average, clutch size also increases up until 6 y.o. at last reproductive attempt (although SE is greater). Mode is greatest for birds whose age at last reproductive attempt is estimated to be 2 (9 eggs), and the maximum clutch size is found in birds that are estimated to reproduce only once or twice. Without further insight, my first though is that birds that invest more energy and resources from their first reproductive attempt wear out sooner and die sooner, whereas birds that don’t invest as much in each reproductive attempt that they undergo manage to save enough energy to live another year and, therefore, reproduce once more. It’s interesting though how clutch size still increases on average (until 6 y.o. at last reproductive attempt).
head(blutidf[which(is.na(blutidf$cs)),])
## X ring year site box fed cs suc suc_prop hatchyear age yo w case
## 921 921 AXH9493 2020 HWP 7 NA NA 3 NA 2018 6 2 2 921
## 950 950 AXH8144 2020 TOM 1 NA NA 0 NA 2019 5 1 1 950
## 951 951 AXH8145 2020 TOM 8 NA NA 6 NA 2019 5 1 1 951
## 952 952 ARD7733 2020 DAV 1 NA NA 1 NA 2018 6 2 3 952
## 953 953 AXH7566 2020 DAV 2 NA NA 5 NA 2019 5 1 2 953
## 954 954 AXH8134 2020 DAV 5 NA NA 5 NA 2019 5 1 1 954
All missing values belong to years 2020 (highlighting once again the effect that COVID had on data collection) and 2021 and sites belonging to the Northern part of the transect.
bluti_cs_summary <- blutidf_3yo %>%
group_by(year) %>%
na.omit(bluti_cs_summary) %>%
summarise_at(vars(cs), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot() +
geom_jitter(data=blutidf_3yo, aes(x=year, y=cs), size=2, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_cs_summary, aes(x=year, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_cs_summary, aes(x=year, y=mean), colour = "black", size = 2, alpha = 0.8) +
labs(x = "Year", y = "Clutch size", title = "Clutch size raw data and mean+sd") +
theme_bw() +
scale_x_continuous(breaks=seq(2014,2024,1)) +
scale_y_continuous(breaks=seq(0,15,1)) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
## Warning: Removed 49 rows containing missing values or values outside the scale range
## (`geom_point()`).
Unlike with fed, average values remain quite close among
years, with a few exceptions (e.g. 2019 and 2020). I guess it’s
interesting to take this into account when fitting random effects.
bluti_cs_summary <- blutidf_3yo %>%
group_by(site) %>%
na.omit(bluti_cs_summary) %>%
summarise_at(vars(cs), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot(blutidf_3yo, aes(x=site, y=cs)) +
geom_jitter(size=1.5, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_cs_summary, aes(x=site, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_cs_summary, aes(x=site, y=mean), colour = "black", size = 2) +
labs(x = "Sites", y = "Clutch size") +
theme_bw() +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 30),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
scale_y_continuous(breaks=seq(0,15,1))
## Warning: Removed 49 rows containing missing values or values outside the scale range
## (`geom_point()`).
There seems to be a lot of variation in clutch size across sites, although on average clutch size remains between ~7 and ~10 eggs:
cs_by_site_summary <- blutidf_3yo %>% group_by(site) %>% na.omit(blutidf_3yo$cs) %>% summarise_at(vars(cs), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
cs_by_site_summary <- cs_by_site_summary %>% arrange(factor(site, levels = level_order), .before=mean)
cs_by_site_summary
## # A tibble: 56 × 7
## # Groups: site [44]
## site mean sd SE mode min max
## <chr> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 EDI 7.8 2.05 0.325 9 1 13
## 2 RSY 8.2 1.30 0.237 8 5 11
## 3 FOF 8.5 1.52 0.214 8 6 12
## 4 BAD 7.97 1.71 0.313 7 3 13
## 5 LVN 7.41 1.83 0.270 7 2 11
## 6 DOW 7.75 1.45 0.296 8 5 11
## 7 GLF 7.83 1.68 0.280 8 4 11
## 8 SER 9.04 1.97 0.287 9 6 14
## 9 MCH 8.61 2.33 0.486 10 3 13
## 10 PTH 9.30 2.17 0.357 10 5 14
## # ℹ 46 more rows
As with fed this suggests that, considering that sites
differ in habitat, cs varies depending on the habitat and
its quality (although it should also be taken into consideration that,
according to Ally, birds stay more or less “faithful” to breeding sites,
usually returning to the same site or to a nearby site year after year;
as a matter of fact, the oldest recorded bird in the transect has always
bred in nest box 1 in EDI site).
blutidf_3yo_suc <- blutidf_3yo %>% filter(!is.na(suc))
birds_per_age_3yo_suc <- blutidf_3yo_suc %>% count(yo)
birds_per_age_3yo_suc_w <- blutidf_3yo_suc %>% count(w)
birds_per_age_3yo_suc # here we can see the number of observations per age group
## yo n
## 1 1 627
## 2 2 509
## 3 3 167
birds_per_age_3yo_suc_w # and here we can see the number of observations per age-at-last-reproduction group
## w n
## 1 1 446
## 2 2 429
## 3 3 249
## 4 4 108
## 5 5 52
## 6 6 18
## 7 7 1
plot1 <- ggplot(blutidf_3yo_suc, aes(x=yo)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_suc$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,3,1), limits=c(1,3)) +
scale_y_continuous(limits=c(0,800)) +
labs(x = "Age", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age (yo) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
plot2 <- ggplot(blutidf_3yo_suc, aes(x=w)) +
geom_freqpoly(colour="#248fc9", binwidth=1, linejoin = "round") +
geom_point(stat="bin", aes(y=after_stat(count)), binwidth=1, colour="black", size = 2) + # I would like to add a label with the frequencies
geom_label(stat="count", label = birds_per_age_3yo_suc_w$n, vjust=-0.5) +
theme_bw() +
scale_x_continuous(breaks=seq(1,7,1), limits=c(1,7)) +
scale_y_continuous(limits=c(0,800)) +
labs(x = "Age at last reproduction (w)", y = "Number of recorded breeding attempts", title = "Overall number of female recordings per age-at-last-reproduction (w) group") +
theme( axis.title = element_text(size = 13),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
grid.arrange(plot1, plot2, ncol=2, nrow = 1)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
Next, let’s take a look at the histogram for suc:
ggplot(blutidf_3yo, aes(suc)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "Number of chicks successfully fledged", y = "Frequency") +
theme_bw() +
scale_x_continuous(breaks=seq(0,16,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 215 rows containing non-finite outside the scale range
## (`stat_bin()`).
The suc histogram clearly shows a spike in 0s. Sanjana
suggested using a Hurdle model to deal with this.
For suc, I was also interested in seeing the proportion
of chicks that fledged (comparing to the initial clutch size). This
proportion will take the complete clutch into account, rather than the
number of hatched eggs. However, it would be potentially more accurate
to use the number of hatched eggs as base (maybe substracting the number
of unhatched eggs found in V1 to the clutch size), but for now I’ll use
clutch size.
ggplot(blutidf_3yo, aes(suc_prop)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=0.1) +
labs(x = "Proportion of chicks successfully fledged", y = "Frequency") +
theme_bw() +
scale_x_continuous(breaks=seq(0,1,0.1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 264 rows containing non-finite outside the scale range
## (`stat_bin()`).
This is obviously not a Normal distribution; maybe exponential with a spike in 0? (If that’s possible)
qqnorm(blutidf_3yo$suc_prop); qqline(blutidf_3yo$suc_prop, col = 2)
Let’s look at how this distribution changes when we plot observations both by age and by age at last reproductive attempt:
ggplot(blutidf_3yo, aes(suc)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age (yo) group") +
theme_bw() +
facet_wrap(~ yo) +
scale_x_continuous(breaks=seq(0,16,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 215 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
Something quite interesting and that pops up straight aways is how
the spike in 0 is smaller as age increases (although overall frequency
decreases as age increases, as in fed and in
cs).
ggplot(blutidf_3yo, aes(suc)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age-at-last-reproduction (w) group") +
theme_bw() +
facet_wrap(~ w) +
scale_x_continuous(breaks=seq(0,16,1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 215 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
The value 0 seems to be always a bit too frequent (except for the bird that is 7 years old, where the sole value is 8 as the two previous recordings are NAs).
Now, what if we look at the proportion of chicks successfully fledge per age and age at last reproduction group?:
ggplot(blutidf_3yo, aes(suc_prop)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=0.1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age (yo) group") +
theme_bw() +
facet_wrap(~ yo) +
scale_x_continuous(breaks=seq(0,1,0.1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 264 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
ggplot(blutidf_3yo, aes(suc_prop)) +
geom_histogram(colour = "black", fill = "#248fc9", binwidth=0.1) +
labs(x = "First egg lay date (1st Jan = 1)", y = "Frequency", title = "Clutch size per age-at-last-reproduction (w) group") +
theme_bw() +
facet_wrap(~ w) +
scale_x_continuous(breaks=seq(0,1,0.1)) +
theme(axis.title = element_text(size = 14,), axis.text = element_text(size = 12, angle = 30))
## Warning: Removed 264 rows containing non-finite outside the scale range
## (`stat_bin()`).
# using a function that will calculate the mode for each of the plotted distributions
blutidf_3yo %>% group_by(yo) %>% na.omit(blutidf_3yo$suc) %>% summarise_at(vars(suc), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max)) # summarising the data to mean, standard deviation, mode, min. value and max. value
## # A tibble: 3 × 7
## yo mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 5.42 3.04 0.139 6 0 13
## 2 2 5.96 2.88 0.145 7 0 12
## 3 3 5.93 2.55 0.223 7 0 11
When we look at the mean fledgeling success per age group, we can see that it increases ~10% from age 1 to age 2 but it then decreases from age 2 to age 3 (it decreases ~0.04%), although (once again) the SE is greater. The mode is also greater at ages 2 and 3 in comparison to age 1, but the maximum number of successfully fledged chicks decreases (which is somewhat expected given that we previously saw that the maximum clutch size is at ages 1 and 2).
We could look at the summary for suc_prop now:
blutidf_3yo %>% group_by(yo) %>% na.omit(blutidf_3yo$suc_prop) %>% summarise_at(vars(suc_prop), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
## # A tibble: 3 × 7
## yo mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.673 0.332 0.0152 1 0 1
## 2 2 0.715 0.304 0.0153 1 0 1
## 3 3 0.719 0.274 0.0240 1 0 1
Here, the proportion of chicks successfully fledged increases as age increases (~4% from 1 y.o. to 2 y.o. and ~0.4% from 2 y.o. to 3 y.o.).
Let’s summarise by age at last reproductive attempt now:
blutidf_3yo %>% group_by(w) %>% na.omit(blutidf_3yo$suc) %>% summarise_at(vars(suc), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
## # A tibble: 6 × 7
## w mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 5.17 3.07 0.165 6 0 12
## 2 2 5.97 2.90 0.163 7 0 13
## 3 3 6.10 2.80 0.202 6 0 12
## 4 4 5.89 2.46 0.257 5 0 11
## 5 5 5.84 2.97 0.448 7 0 11
## 6 6 5.2 2.73 0.705 5 0 9
Here it seems that fledgeling success increases up until w=3 and then it decreases, and the maximum number of successfully fledged chicks follows a similar trend.
blutidf_3yo %>% group_by(w) %>% na.omit(blutidf_3yo$suc_prop) %>% summarise_at(vars(suc_prop), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
## # A tibble: 9 × 7
## # Groups: w [6]
## w mean sd SE mode min max
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.646 0.343 0.0184 1 0 1
## 2 2 0.718 0.305 0.0172 1 0 1
## 3 3 0.740 0.288 0.0208 1 0 1
## 4 4 0.717 0.261 0.0273 1 0 1
## 5 5 0.698 0.323 0.0487 1 0 1
## 6 6 0.660 0.327 0.0844 0 0 1
## 7 6 0.660 0.327 0.0844 1 0 1
## 8 6 0.660 0.327 0.0844 0.875 0 1
## 9 6 0.660 0.327 0.0844 0.714 0 1
That trend also applies when we look at proportion of successfully fledged chicks: average proportion is greater at w=3.
head(blutidf[which(is.na(blutidf$suc)),])
## X ring year site box fed cs suc suc_prop hatchyear age yo w case
## 272 272 Z632446 2017 EDI 2 106 9 NA NA 2014 6 3 5 272
## 273 273 S922356 2017 EDI 3 114 10 NA NA 2016 5 1 4 273
## 275 275 S229242 2017 EDI 6 113 6 NA NA 2016 5 1 3 275
## 276 276 S922813 2017 EDI 8 110 10 NA NA 2016 5 1 3 276
## 277 277 S922747 2017 RSY 1 114 9 NA NA 2016 5 1 2 277
## 278 278 S922815 2017 RSY 2 121 9 NA NA 2015 6 2 2 278
In this case, most NAs values belong to cases involved in the clutch swap experiments (that took place between 2017 and 2019). It should also include all cases where suc = -999. These are cases where nest boxes were predated, which, for now, I have decided to consider as NAs rather than as 0s as it does not necessarily reflect neither between- nor within-individual processes influencing breeding trait trends and, therefore, could obscure the results.
bluti_suc_summary <- blutidf_3yo %>%
group_by(year) %>%
na.omit(bluti_suc_summary) %>%
summarise_at(vars(suc), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot() +
geom_jitter(data=blutidf_3yo, aes(x=year, y=suc), size=2, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_suc_summary, aes(x=year, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_suc_summary, aes(x=year, y=mean), colour = "black", size = 2, alpha = 0.8) +
labs(x = "Year", y = "Fledgeling success", title = "Fledgeling success raw data and mean+sd") +
theme_bw() +
scale_x_continuous(breaks=seq(2014,2024,1)) +
scale_y_continuous(breaks=seq(0,15,1)) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
## Warning: Removed 215 rows containing missing values or values outside the scale range
## (`geom_point()`).
There’s some variation between years, and what catches my eye the most is the extremely low fledgeling success in year 2024. However, this is expected as we know that last year was a particularly bad year. According to this, both 2015 and 2020 might have also been bad years, while it seems that 2014 has been the best year so far.
bluti_suc_summary <- blutidf_3yo %>%
group_by(site) %>%
na.omit(bluti_suc_summary) %>%
summarise_at(vars(suc), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot(blutidf_3yo, aes(x=site, y=suc)) +
geom_jitter(size=1.5, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_suc_summary, aes(x=site, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_suc_summary, aes(x=site, y=mean), colour = "black", size = 2) +
labs(x = "Sites", y = "Number of chicks successfully fledged") +
theme_bw() +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 30),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
scale_y_continuous(breaks=seq(0,15,1))
## Warning: Removed 215 rows containing missing values or values outside the scale range
## (`geom_point()`).
On average, there appears to be some variation, with sites in which fledgeling success is quite low on average (3-4 chicks) and sites in which fledgeling success is okay-ish (with not one site reaching 8 successfully fledged chicks on average):
suc_by_site_summary <- blutidf_3yo %>% group_by(site) %>% na.omit(blutidf_3yo$suc) %>% summarise_at(vars(suc), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
suc_by_site_summary <- suc_by_site_summary %>% arrange(factor(site, levels = level_order), .before=mean)
suc_by_site_summary
## # A tibble: 53 × 7
## # Groups: site [44]
## site mean sd SE mode min max
## <chr> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 EDI 5.88 3.07 0.485 8 0 11
## 2 RSY 3.73 2.99 0.547 0 0 9
## 3 FOF 6.26 2.84 0.402 8 0 12
## 4 BAD 5.97 2.63 0.481 6 0 12
## 5 LVN 5.67 1.86 0.275 6 0 9
## 6 DOW 5.21 2.69 0.548 7 0 10
## 7 GLF 4.19 3.26 0.543 0 0 10
## 8 SER 6.62 2.95 0.431 9 0 11
## 9 MCH 6.74 2.80 0.584 5 0 11
## 10 PTH 7.30 2.77 0.455 7 0 12
## # ℹ 43 more rows
Again, what about proportion of successfully fledged chicks?:
bluti_suc_prop_summary <- blutidf_3yo %>%
group_by(site) %>%
na.omit(bluti_suc_summary) %>%
summarise_at(vars(suc_prop), list(mean=mean, SE=se)) %>%
as.data.frame() # summarising first egg lay date observations by calculating its mean and standard deviation
ggplot(blutidf_3yo, aes(x=site, y=suc_prop)) +
geom_jitter(size=1.5, alpha=0.25, colour = "#248fc9") +
geom_errorbar(data=bluti_suc_prop_summary, aes(x=site, y=mean, ymin=mean-SE, ymax=mean+SE)) +
geom_point(data=bluti_suc_prop_summary, aes(x=site, y=mean), colour = "black", size = 2) +
labs(x = "Sites", y = "Proportion of chicks successfully fledged") +
theme_bw() +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 30),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
scale_y_continuous(breaks=seq(0,1,0.1))
## Warning: Removed 264 rows containing missing values or values outside the scale range
## (`geom_point()`).
There are some sites that apear to be quite low in comparison to others, namely RSY, DOW, DUN, DNS, DAV and maybe OSP and DOR:
suc_prop_by_site_summary <- blutidf_3yo %>% group_by(site) %>% na.omit(blutidf_3yo$suc_prop) %>% summarise_at(vars(suc_prop), list(mean=mean, sd=sd, SE=se, mode=find_mode, min=min, max=max))
suc_prop_by_site_summary <- suc_prop_by_site_summary %>% arrange(factor(site, levels = level_order), .before=mean)
suc_prop_by_site_summary
## # A tibble: 58 × 7
## # Groups: site [44]
## site mean sd SE mode min max
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EDI 0.729 0.343 0.0542 1 0 1
## 2 RSY 0.465 0.376 0.0687 0 0 1
## 3 FOF 0.741 0.309 0.0437 1 0 1
## 4 BAD 0.743 0.257 0.0470 1 0 1
## 5 LVN 0.796 0.226 0.0333 1 0 1
## 6 DOW 0.667 0.321 0.0656 1 0 1
## 7 GLF 0.531 0.389 0.0648 0 0 1
## 8 SER 0.752 0.305 0.0445 1 0 1
## 9 MCH 0.764 0.257 0.0537 1 0 1
## 10 PTH 0.791 0.251 0.0412 1 0 1
## # ℹ 48 more rows
I’ve used the file site details as a reference to understand when data collection started for each site and when (and if) new boxes were added. However, COVID year was unusual in terms of data collection and fielwork protocol (e.g. visit frequency to sites), and the Northern part of the transect has not always been extensively visited, and therefore nest occupancy for some sites and/or years might be underestimated (if there is a 0 instead of NA, for example). Therefore, I would be cautious when looking at the estimates of nest occupancy.
As another warning, I would like to apologise if my code is very inefficient and tedious as I usually don’t see the faster route straight away (if at all).
rownames(sites) <- sites$site # naming each row after the site it represents
sites$extra <- sites$dateofextraboxes # creating a new column that will store the year in which new nest boxes were added
sites$extra <- c(rep(2017,3), NA, 2017, NA, 2017, 2017, NA, rep(2017, 5), NA, rep(2017, 6), NA, NA, 2017, 2017, NA, 2017, 2017, NA, 2017, 2017, NA, rep(2017, 8), NA, 2017, NA, NA)
sites["BAD", c("X2015", "X2016")] <- 1
sites["MCH", "X2015"] <- 1
sites["PTH", "X2016"] <- 1
sites["BIR", "X2015"] <- 1
sites["DUN", "X2016"] <- 1
sites["DLW", "X2020"] <- 1
sites["CRU", "X2020"] <- 1
sites[25-44, "X2020"] <- 1
sites[38-44, "X2021"] <- 1
sites["TAI", "X2015"] <- 1
# In the previous lines I'm converting NAs into 1s as there are recordings of occupied nest boxes in those sites for those years, and therefore they were visited.
sites$X2022 <- 1
sites$X2023 <- 1
sites$X2024 <- 1
# Here I'm creating new columns for the years 2022, 2023 and 2024 as they were not included in the site_details.csv file.
sites <- sites %>% relocate(X2022:X2024, .before = extras) # re-arranging the columns
av_occupancy <- data.frame(site = sites$site, occupancy = rep(NA, nrow(sites))) # creating a new data frame that will store the average occupancy across years per site
Now, I wanted to estimate nest occupancy for those years were the site was visited and taking into account that some sites experimented an increase in the number of nest boxes at some point during the project. That is what the following bit of code does, in this case for the site EDI. However, the same code has been applied for all 44 sites in the project.
# EDI site
EDI_nb <- data.frame(year = c(2014:2024), site = rep("EDI", 11), all_nb = seq(1,11,1), occ_nb = rep(0, 11)) # creating a dataframe that will store total number of nest boxes and occupied nest boxes per year (repeated for all sites)
for (i in 1:nrow(EDI_nb)) {
if (EDI_nb[i, "year"] < sites["EDI", "extra"]) {
EDI_nb[i,"all_nb"] <- sites["EDI","Original.Boxes"]
} else if (EDI_nb[i, "year"] >= sites["EDI", "extra"]) {
EDI_nb[i,"all_nb"] <- sites["EDI","Current.Boxes"]
}
} # the aim of this loop is to correctly assign the number of nest boxes present in the site for each year taking into account that the number of nest boxes was increased for most sites, but not all.
row <- 0
for (i in 10:17) {
row <- row + 1
if (is.na(sites["EDI",i])) {
EDI_nb[row, "all_nb"] <- NA
}
} # the aim of this loop is to assign NA to nest occupancy for years in which either nest boxes were not yet installed or they were not checked.
blutiEDI <- blutidf[which(blutidf$site == "EDI"),] %>% count(year) # selecting all recordings in my dataset that cover the site of interest (in this case, EDI) for the years of interest (between 2014 and 2021). Assuming that there is one recording per nest box, per site, per year, by counting the numbers of recordings per site and per year I can estimate the number of nest boxes occupied by blue tits.
for (i in 1:nrow(EDI_nb)) {
if (EDI_nb[i,"year"] %in% blutiEDI$year) {
EDI_nb[i,"occ_nb"] <- blutiEDI[which(blutiEDI$year == EDI_nb[i,"year"]),"n"]
} else {
EDI_nb[i,"occ_nb"] <- 0
}
} # adding the number of recorded nest boxes (per year) to the data frame defined previously
EDI_nb$occupancy <- EDI_nb$occ_nb/EDI_nb$all_nb # calculating occupancy as the proportion of occupied nest boxes out of the total number of nest boxes present
av_occupancy[which(av_occupancy$site == "EDI"),"occupancy"] <- mean(na.omit(EDI_nb$occupancy)) # calculating the average occupancy rate and storing it in a separate data frame
After doing this for all sites, the result is the following file:
site_occupancy <- read.csv("C:/Users/julia/OneDrive - University of Edinburgh/EEB_MSc_UEd/Dissertation/diss/data/site_occupancy.csv")
site_occupancy # this file stores the average occupancy per site across years
## X site occupancy
## 1 1 EDI 0.8809524
## 2 2 RSY 0.6547619
## 3 3 FOF 0.8802083
## 4 4 BAD 0.6458333
## 5 5 LVN 0.6666667
## 6 6 DOW 0.8000000
## 7 7 GLF 0.7291667
## 8 8 SER 0.9047619
## 9 9 MCH 0.5208333
## 10 10 PTH 0.7447917
## 11 11 STY 0.6302083
## 12 12 BIR 0.4739583
## 13 13 DUN 0.2864583
## 14 14 BLG 0.7500000
## 15 15 PIT 0.8250000
## 16 16 KCK 0.4218750
## 17 17 KCZ 0.5119048
## 18 18 BLA 0.4375000
## 19 19 CAL 0.2708333
## 20 20 DNM 0.3125000
## 21 21 DNC 0.1145833
## 22 22 DNS 0.1785714
## 23 23 DLW 0.2083333
## 24 24 CRU 0.2239583
## 25 25 NEW 0.4427083
## 26 26 HWP 0.3250000
## 27 27 INS 0.2708333
## 28 28 FSH 0.4947917
## 29 29 RTH 0.3095238
## 30 30 AVI 0.4583333
## 31 31 AVN 0.7321429
## 32 32 CAR 0.3958333
## 33 33 SLS 0.3571429
## 34 34 TOM 0.1250000
## 35 35 DAV 0.4322917
## 36 36 ART 0.6527778
## 37 37 MUN 0.4791667
## 38 38 FOU 0.3593750
## 39 39 ALN 0.4843750
## 40 40 DEL 0.6927083
## 41 41 TAI 0.5625000
## 42 42 SPD 0.2678571
## 43 43 OSP 0.2500000
## 44 44 DOR 0.4791667
Then, I could add these occupancy rates to both of my databases:
environment <- data.frame(sites = sites$site, longitude = sites$Mean.Long, latitude = sites$Mean.Lat, elevation = sites$Mean.Elev) # creating a new data frame that will store all potential environment variables
environment$site_occ <- site_occupancy$occupancy[match(environment$site, site_occupancy$site)]
# Then, I could also add it to both my data bases
blutidf$site_occ <- site_occupancy$occupancy[match(blutidf$site, site_occupancy$site)]
blutidf_3yo$site_occ <- site_occupancy$occupancy[match(blutidf_3yo$site, site_occupancy$site)]
as_tibble(blutidf)
## # A tibble: 1,657 × 15
## X ring year site box fed cs suc suc_prop hatchyear age
## <int> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <int> <int>
## 1 1 Z081142 2014 FOF 1 114 11 10 0.909 2012 6
## 2 2 Z081712 2014 FOF 2 126 8 7 0.875 2013 5
## 3 3 Z081138 2014 FOF 3 111 9 6 0.667 2013 5
## 4 4 Z081141 2014 FOF 5 115 8 6 0.75 2012 6
## 5 5 Z081040 2014 FOF 6 109 7 5 0.714 2013 5
## 6 6 Z081146 2014 BAD 3 113 11 11 1 2012 6
## 7 7 Z081069 2014 BAD 5 110 10 8 0.8 2013 5
## 8 8 Z081687 2014 LVN 1 124 7 7 1 2013 5
## 9 9 Z081715 2014 LVN 2 124 10 8 0.8 2012 6
## 10 10 L892095 2014 LVN 3 126 8 7 0.875 2012 6
## # ℹ 1,647 more rows
## # ℹ 4 more variables: yo <int>, w <int>, case <int>, site_occ <dbl>
as_tibble(blutidf_3yo)
## # A tibble: 1,518 × 15
## X ring year site box fed cs suc suc_prop hatchyear age
## <int> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <int> <int>
## 1 1 Z081142 2014 FOF 1 114 11 10 0.909 2012 6
## 2 2 Z081712 2014 FOF 2 126 8 7 0.875 2013 5
## 3 3 Z081138 2014 FOF 3 111 9 6 0.667 2013 5
## 4 4 Z081141 2014 FOF 5 115 8 6 0.75 2012 6
## 5 5 Z081040 2014 FOF 6 109 7 5 0.714 2013 5
## 6 6 Z081146 2014 BAD 3 113 11 11 1 2012 6
## 7 7 Z081069 2014 BAD 5 110 10 8 0.8 2013 5
## 8 8 Z081687 2014 LVN 1 124 7 7 1 2013 5
## 9 9 Z081715 2014 LVN 2 124 10 8 0.8 2012 6
## 10 10 L892095 2014 LVN 3 126 8 7 0.875 2012 6
## # ℹ 1,508 more rows
## # ℹ 4 more variables: yo <int>, w <int>, case <int>, site_occ <dbl>
ggplot(environment, aes(x=sites, y=site_occ)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Average occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
Average nest box occupation does not seem to follow a latitudinal pattern, but it is clear that some sites have very low occupancy (DNM, DNC, DNS, DLW, SLS…) while others have a high occupancy (EDI, FOF, GLF…). However, it might not be correlated in any way to breeding success.
According to Shutt et al. (2018), “Total foliage, oak, sycamore and tree diversity all appear to decrease at higher elevations, with birch and willow displaying the opposite trend”. It could potentially be used as an environmental predictor, but the biggest drawback that I see for it is that as birch and willow foliage/availability increase with elevation, and their presence might compensate (buffer) oak and sycamore decreased availability (and thus, their resources’) absence. Still, I tried to explore it a little bit.
blutidf$elevation <- environment$elevation[match(blutidf$site, environment$sites)]
blutidf_3yo$elevation <- environment$elevation[match(blutidf_3yo$site, environment$sites)] # adding elevation to both databases
as_tibble(blutidf)
## # A tibble: 1,657 × 16
## X ring year site box fed cs suc suc_prop hatchyear age
## <int> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <int> <int>
## 1 1 Z081142 2014 FOF 1 114 11 10 0.909 2012 6
## 2 2 Z081712 2014 FOF 2 126 8 7 0.875 2013 5
## 3 3 Z081138 2014 FOF 3 111 9 6 0.667 2013 5
## 4 4 Z081141 2014 FOF 5 115 8 6 0.75 2012 6
## 5 5 Z081040 2014 FOF 6 109 7 5 0.714 2013 5
## 6 6 Z081146 2014 BAD 3 113 11 11 1 2012 6
## 7 7 Z081069 2014 BAD 5 110 10 8 0.8 2013 5
## 8 8 Z081687 2014 LVN 1 124 7 7 1 2013 5
## 9 9 Z081715 2014 LVN 2 124 10 8 0.8 2012 6
## 10 10 L892095 2014 LVN 3 126 8 7 0.875 2012 6
## # ℹ 1,647 more rows
## # ℹ 5 more variables: yo <int>, w <int>, case <int>, site_occ <dbl>,
## # elevation <int>
as_tibble(blutidf_3yo)
## # A tibble: 1,518 × 16
## X ring year site box fed cs suc suc_prop hatchyear age
## <int> <chr> <int> <chr> <int> <int> <int> <int> <dbl> <int> <int>
## 1 1 Z081142 2014 FOF 1 114 11 10 0.909 2012 6
## 2 2 Z081712 2014 FOF 2 126 8 7 0.875 2013 5
## 3 3 Z081138 2014 FOF 3 111 9 6 0.667 2013 5
## 4 4 Z081141 2014 FOF 5 115 8 6 0.75 2012 6
## 5 5 Z081040 2014 FOF 6 109 7 5 0.714 2013 5
## 6 6 Z081146 2014 BAD 3 113 11 11 1 2012 6
## 7 7 Z081069 2014 BAD 5 110 10 8 0.8 2013 5
## 8 8 Z081687 2014 LVN 1 124 7 7 1 2013 5
## 9 9 Z081715 2014 LVN 2 124 10 8 0.8 2012 6
## 10 10 L892095 2014 LVN 3 126 8 7 0.875 2012 6
## # ℹ 1,508 more rows
## # ℹ 5 more variables: yo <int>, w <int>, case <int>, site_occ <dbl>,
## # elevation <int>
ggplot(environment, aes(x=sites, y=elevation)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Average site elevation") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
Although very roughly, there may be some degree of correlation between average nest occupancy and average site elevation:
ggplot(environment, aes(x=elevation, y=site_occ)) +
geom_point() +
theme_bw() +
labs(x = "Elevation", y = "Average occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
It appears that as elevation increases, nest occupancy decreases. Although it might not be directly relevant to environment characterisation, it will most likely be something interesting to take into account as low nest occupancy might imply less competition over available resources, and therefore this could compensate for a bad quality site (e.g. if the site provides poor resources, but these resources have to be split only among a few birds, they could potentially perform better than birds that live in a very good quality site but that have to compete with more birds for access to the available resources, especially if they are not good competitors).
As we’ve discussed before, there’s different ways in which tree composition could be modelled. Here I explore different option from less to most complex (and, potentially, comprehensive).
habitat <- read.csv("C:/Users/julia/OneDrive - University of Edinburgh/EEB_MSc_UEd/Dissertation/diss/data/Habitats.csv") # loading the file that stores the count of trees by nestbox and site
as_tibble(habitat)
## # A tibble: 335 × 80
## site nestbox recorder radius X6_alder m_alder s_alder l_ash m_ash s_ash
## <chr> <int> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 EDI 1 MB 15 NA NA NA NA NA NA
## 2 EDI 2 MB 15 NA NA NA NA NA NA
## 3 EDI 3 MB 15 NA NA NA NA NA NA
## 4 EDI 4 MB 15 NA NA NA NA NA 2
## 5 EDI 5 MB 15 NA NA NA NA NA NA
## 6 EDI 6 MB 15 NA NA NA NA NA NA
## 7 RSY 1 MB 15 NA NA NA NA 3 7
## 8 RSY 2 MB 15 NA NA NA NA 2 6
## 9 RSY 3 MB 15 NA NA NA NA NA 13
## 10 RSY 4 MB 15 NA NA NA NA 11 7
## # ℹ 325 more rows
## # ℹ 70 more variables: m_aspen <int>, s_aspen <int>, l_beech <int>,
## # m_beech <int>, s_beech <int>, X21_birch <int>, X6_birch <int>,
## # m_birch <int>, s_birch <int>, X21_blackthorn <int>, X6_cherry <int>,
## # m_cherry <int>, s_cherry <int>, m_chestnut <int>, s_chestnut <int>,
## # l_conifer <int>, m_conifer <int>, s_conifer <int>, X6_elder <int>,
## # s_elder <int>, l_elm <int>, m_elm <int>, s_elm <int>, s_hawthorn <int>, …
This file stores tree composition within 15 m ratius for all nest boxes and all sites, classifying according to the following categories: * Trees: + Small: trunk girth at brest height (gbh) is between 40 and 99 mm. + Medium: 100-249 mm trunk gbh. + Large: >250 mm trunk gbh. * Stands: category for vegetation (e.g. shrubs) that would not fit within the tree category but that still provides exploitable resources. + Stands 6-20: shrub stands from which 6-20 branches split within 20 cm of their base. They visually estimated each of these stands is equivalent to 0.5 trees. + Stands 21+: >20 branches split within 20 cm of the base of the stands. They visually estimated each of these stands is equivalent to 1 small tree.
Using these recordings, I estimated the following predictors:
Since oaks seem to offer the most resources for blue tits, sites where the number of oaks is higher could be interpreted as better sites. However, this wouldn’t allow us to explore how other tree geni predict all our age-specific breeding trait, and according to Shutt et al. (2018) and Macphie et al. (2025), we know that taxa like birch, sycamore or willow positively impact fledgeling success and/or caterpillar availability for blue tits across the whole breeding season, and therefore we would be missing key information.
oak_sites <- habitat %>% select(site, nestbox, l_oak, m_oak, s_oak)
oak_sites <- oak_sites %>% replace_na(list(l_oak = 0, m_oak = 0, s_oak = 0))
oak_sites$TOTAL_oak <- 0
for (i in 1:nrow(oak_sites)) {
oak_sites[i,"TOTAL_oak"] <- sum(c(oak_sites[i,"s_oak"], oak_sites[i,"m_oak"], oak_sites[i,"l_oak"]))
}
for (i in 1:nrow(environment)) {
environment[i,"TOTAL_oak"] <- sum(oak_sites[which(oak_sites$site == environment[i,"sites"]),"TOTAL_oak"])
}
plot1 <- ggplot(environment, aes(x=sites, y=TOTAL_oak)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Absolute number of oaks") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot1
Here we can already see that there’s quite a big variation between sites
based solely on the absolute number of oaks present. However, there are
quite a few sites that have no oaks available, and in terms of this
variable, they are identical in environmental conditions, which is not
accurate.
ggplot(environment, aes(x=elevation, y=TOTAL_oak)) +
geom_point() +
theme_bw() +
labs(x = "Elevation", y = "Absolute number of oaks") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Although sites with a greater number of oaks are usually of low-mid elevation according to this graph, it might be too vague:
lm2 <- lm(TOTAL_oak ~ elevation, data = environment)
summary(lm2)
##
## Call:
## lm(formula = TOTAL_oak ~ elevation, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.490 -15.389 -10.111 2.656 92.689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.51201 7.11449 3.445 0.00131 **
## elevation -0.04445 0.03748 -1.186 0.24231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.05 on 42 degrees of freedom
## Multiple R-squared: 0.0324, Adjusted R-squared: 0.009364
## F-statistic: 1.406 on 1 and 42 DF, p-value: 0.2423
ggplot(environment, aes(x=elevation, y=TOTAL_oak)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Elevation", y = "Absolute number of oaks") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
ggplot(environment, aes(x=TOTAL_oak, y=site_occ)) +
geom_point() +
theme_bw() +
labs(x = "Absolute number of oaks", y = "Average occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
At first glance, there does not appear to be a particularly strong correlation between the number of oak trees present at a site and its average occupancy.
lm3 <- lm(site_occ ~ TOTAL_oak, data = environment)
summary(lm3)
##
## Call:
## lm(formula = site_occ ~ TOTAL_oak, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34479 -0.16697 -0.01927 0.16931 0.42518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.459378 0.038105 12.056 3.18e-15 ***
## TOTAL_oak 0.001837 0.001154 1.592 0.119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2133 on 42 degrees of freedom
## Multiple R-squared: 0.05689, Adjusted R-squared: 0.03444
## F-statistic: 2.534 on 1 and 42 DF, p-value: 0.1189
ggplot(environment, aes(x=TOTAL_oak, y=site_occ)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Absolute number of oaks", y = "Average occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
habitat <- habitat %>% replace(is.na(.), 0) # replacing all NAs for 0s
# Creating a dataframe summarising the number of trees per genus recorded at each nestbox, ignoring their girth
trees_per_box <- data.frame(sites = habitat$site, box = habitat$nestbox, alder = rep(0, nrow(habitat)), ash = rep(0, nrow(habitat)), aspen = rep(0, nrow(habitat)), beech = rep(0, nrow(habitat)), birch = rep(0, nrow(habitat)), blackthorn = rep(0, nrow(habitat)), cherry = rep(0, nrow(habitat)), chestnut = rep(0, nrow(habitat)), conifer = rep(0, nrow(habitat)), elder = rep(0, nrow(habitat)), elm = rep(0, nrow(habitat)), hawthorn = rep(0, nrow(habitat)), hazel = rep(0, nrow(habitat)), holly = rep(0, nrow(habitat)), juniper = rep(0, nrow(habitat)), lime = rep(0, nrow(habitat)), oak = rep(0, nrow(habitat)), pine = rep(0, nrow(habitat)), rose = rep(0, nrow(habitat)), rowan = rep(0, nrow(habitat)), sycamore = rep(0, nrow(habitat)), whitebeam = rep(0, nrow(habitat)), willow = rep(0, nrow(habitat)), yew = rep(0, nrow(habitat)), other = rep(0, nrow(habitat)))
trees_per_box$alder <- habitat$X6_alder + habitat$m_alder + habitat$s_alder # calculating the number of each tree genus by adding recordings of the same tree genus of different girth
trees_per_box$ash <- habitat$l_ash + habitat$m_ash + habitat$s_ash
trees_per_box$aspen <- habitat$m_aspen + habitat$s_aspen
trees_per_box$beech <- habitat$l_beech + habitat$m_beech + habitat$s_beech + habitat$X6_beech
trees_per_box$birch <- habitat$l_birch + habitat$m_birch + habitat$s_birch + habitat$X21_birch + habitat$X6_birch
trees_per_box$blackthorn <- habitat$z_blackthorn + habitat$X21_blackthorn
trees_per_box$cherry <- habitat$l_cherry + habitat$m_cherry + habitat$s_cherry + habitat$z_cherry + habitat$X6_cherry
trees_per_box$chestnut <- habitat$m_chestnut + habitat$s_chestnut
trees_per_box$conifer <- habitat$l_conifer + habitat$m_conifer + habitat$s_conifer
trees_per_box$elder <- habitat$s_elder + habitat$X6_elder
trees_per_box$elm <- habitat$l_elm + habitat$m_elm + habitat$s_elm
trees_per_box$hawthorn <- habitat$s_hawthorn
trees_per_box$hazel <- habitat$s_hazel + habitat$X6_hazel + habitat$X21_hazel
trees_per_box$holly <- habitat$m_holly + habitat$s_holly + habitat$X6_holly + habitat$X21_holly
trees_per_box$juniper <- habitat$s_juniper + habitat$X6_juniper
trees_per_box$lime <- habitat$m_lime + habitat$s_lime
trees_per_box$oak <- habitat$l_oak + habitat$m_oak + habitat$s_oak
trees_per_box$pine <- habitat$l_pine + habitat$m_pine + habitat$s_pine
trees_per_box$rose <- habitat$X6_rose
trees_per_box$rowan <- habitat$m_rowan + habitat$s_rowan + habitat$X6_rowan
trees_per_box$sycamore <- habitat$s_sycamore + habitat$l_sycamore + habitat$m_sycamore + habitat$X6_sycamore
trees_per_box$whitebeam <- habitat$m_whitebeam + habitat$s_whitebeam
trees_per_box$willow <- habitat$l_willow + habitat$m_willow + habitat$s_willow + habitat$z_willow + habitat$X6_willow + habitat$X21_willow
trees_per_box$yew <- habitat$m_yew + habitat$s_yew
trees_per_box$other <- habitat$s_other + habitat$z_other
# Calculating the total number of trees recorded at each nest box, irrespective of their genus
trees_per_box$total_trees <- trees_per_box$alder + trees_per_box$ash + trees_per_box$aspen + trees_per_box$beech + trees_per_box$birch + trees_per_box$blackthorn + trees_per_box$cherry + trees_per_box$chestnut + trees_per_box$conifer + trees_per_box$elder + trees_per_box$elm + trees_per_box$hawthorn + trees_per_box$hazel + trees_per_box$holly + trees_per_box$juniper + trees_per_box$lime + trees_per_box$oak + trees_per_box$pine + trees_per_box$rose + trees_per_box$rowan + trees_per_box$sycamore + trees_per_box$whitebeam + trees_per_box$willow + trees_per_box$yew + trees_per_box$other
# Creating a dataframe to store all trees per site rather than per nest box
trees <- data.frame(sites = sites$site, alder = rep(0, nrow(sites)), ash = rep(0, nrow(sites)), aspen = rep(0, nrow(sites)), beech = rep(0, nrow(sites)), birch = rep(0, nrow(sites)), blackthorn = rep(0, nrow(sites)), cherry = rep(0, nrow(sites)), chestnut = rep(0, nrow(sites)), conifer = rep(0, nrow(sites)), elder = rep(0, nrow(sites)), elm = rep(0, nrow(sites)), hawthorn = rep(0, nrow(sites)), hazel = rep(0, nrow(sites)), holly = rep(0, nrow(sites)), juniper = rep(0, nrow(sites)), lime = rep(0, nrow(sites)), oak = rep(0, nrow(sites)), pine = rep(0, nrow(sites)), rose = rep(0, nrow(sites)), rowan = rep(0, nrow(sites)), sycamore = rep(0, nrow(sites)), whitebeam = rep(0, nrow(sites)), willow = rep(0, nrow(sites)), yew = rep(0, nrow(sites)), other = rep(0, nrow(sites)))
for (i in 1:nrow(trees)) {
trees[i,"alder"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"alder"])
} # adding all alders at all nest boxes per site
for (i in 1:nrow(trees)) {
trees[i,"ash"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"ash"])
} # ash
for (i in 1:nrow(trees)) {
trees[i,"aspen"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"aspen"])
} # aspen
for (i in 1:nrow(trees)) {
trees[i,"beech"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"beech"])
} # beech
for (i in 1:nrow(trees)) {
trees[i,"birch"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"birch"])
} # birch
for (i in 1:nrow(trees)) {
trees[i,"cherry"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"cherry"])
} # cherry
for (i in 1:nrow(trees)) {
trees[i,"blackthorn"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"blackthorn"])
} # blackthorn
for (i in 1:nrow(trees)) {
trees[i,"chestnut"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"chestnut"])
} # chestnut
for (i in 1:nrow(trees)) {
trees[i,"conifer"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"conifer"])
} # conifer
for (i in 1:nrow(trees)) {
trees[i,"elder"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"elder"])
} # elder
for (i in 1:nrow(trees)) {
trees[i,"elm"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"elm"])
} # elm
for (i in 1:nrow(trees)) {
trees[i,"hawthorn"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"hawthorn"])
} # hawthorn
for (i in 1:nrow(trees)) {
trees[i,"hazel"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"hazel"])
} # hazel
for (i in 1:nrow(trees)) {
trees[i,"holly"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"holly"])
} # holly
for (i in 1:nrow(trees)) {
trees[i,"juniper"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"juniper"])
} # juniper
for (i in 1:nrow(trees)) {
trees[i,"lime"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"lime"])
} # lime
for (i in 1:nrow(trees)) {
trees[i,"oak"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"oak"])
} # oak
for (i in 1:nrow(trees)) {
trees[i,"pine"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"pine"])
} # pine
for (i in 1:nrow(trees)) {
trees[i,"rose"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"rose"])
} # rose
for (i in 1:nrow(trees)) {
trees[i,"rowan"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"rowan"])
} # rowan
for (i in 1:nrow(trees)) {
trees[i,"sycamore"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"sycamore"])
} # sycamore
for (i in 1:nrow(trees)) {
trees[i,"whitebeam"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"whitebeam"])
} # whitebeam
for (i in 1:nrow(trees)) {
trees[i,"willow"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"willow"])
} # willow
for (i in 1:nrow(trees)) {
trees[i,"yew"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"yew"])
} # yew
for (i in 1:nrow(trees)) {
trees[i,"other"] <- sum(trees_per_box[which(trees_per_box$sites == trees[i,"sites"]),"other"])
} # other
trees$total_trees <- trees$alder + trees$ash + trees$aspen + trees$beech + trees$birch + trees$blackthorn + trees$cherry + trees$chestnut + trees$conifer + trees$elder + trees$elm + trees$hawthorn + trees$hazel + trees$holly + trees$juniper +trees$lime + trees$oak + trees$pine + trees$rose + trees$rowan + trees$sycamore + trees$whitebeam + trees$willow + trees$yew + trees$other
trees$oak_proportion <- trees$oak/trees$total_trees
environment$proportion_oak <- trees$oak_proportion
plot2 <- ggplot(environment, aes(x=sites, y=proportion_oak)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Proportion of oak trees") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot2
There appear to be no great differences in habitat characterisation between using absolute number of oaks and their proportion with respect to the rest of the trees present.
lm4 <- lm(proportion_oak ~ TOTAL_oak, data = environment)
summary(lm4)
##
## Call:
## lm(formula = proportion_oak ~ TOTAL_oak, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12321 -0.01247 0.01350 0.01350 0.17565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0134969 0.0090741 -1.487 0.144
## TOTAL_oak 0.0075674 0.0002748 27.541 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05078 on 42 degrees of freedom
## Multiple R-squared: 0.9475, Adjusted R-squared: 0.9463
## F-statistic: 758.5 on 1 and 42 DF, p-value: < 2.2e-16
ggplot(environment, aes(x=TOTAL_oak, y=proportion_oak)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Absolute number of oaks", y = "Average occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
And as one would expect, absolute number of oaks and
proportion of oaks are strongly positively correlated,
which (I assume) implies that the use of either one of them should not
deliver too-different results
The purpose of adding oak, birch and sycamore trees together is to hopefully capture the buffering role of birch and sycamore.
trees$oak_birch_sycamore <- trees$birch + trees$oak + trees$sycamore
environment$oak_birch_sycamore <- trees$oak_birch_sycamore
plot3 <- ggplot(environment, aes(x=sites, y=oak_birch_sycamore)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oaks, birchs and sycamores combined") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
This combination appears to capture site differences better between those sites that had no oak availability.
ggplot(environment, aes(x=elevation, y=oak_birch_sycamore)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Elevation", y = "Absolute number of oaks") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
In contrast to the Absolute number of oak vs. Elevation
plot, the sum of oak, birch and sycamore trees doesn’t appear to be
correlated with elevation.
trees$proportion_oak_birch_sycamore <- trees$oak_birch_sycamore/trees$total_trees
environment$proportion_oak_birch_sycamore <- trees$proportion_oak_birch_sycamore
plot4 <- ggplot(environment, aes(x=sites, y=proportion_oak_birch_sycamore)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Proportion of oaks, birchs and sycamores combined") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot4
lm5 <- lm(proportion_oak_birch_sycamore ~ oak_birch_sycamore, data = environment)
summary(lm5)
##
## Call:
## lm(formula = proportion_oak_birch_sycamore ~ oak_birch_sycamore,
## data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29697 -0.14353 -0.00327 0.09097 0.50938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2236737 0.0556218 4.021 0.000236 ***
## oak_birch_sycamore 0.0032555 0.0004639 7.018 1.37e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1818 on 42 degrees of freedom
## Multiple R-squared: 0.5397, Adjusted R-squared: 0.5288
## F-statistic: 49.25 on 1 and 42 DF, p-value: 1.369e-08
ggplot(environment, aes(x=oak_birch_sycamore, y=proportion_oak_birch_sycamore)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Absolute number of oak, sycamore and birch trees (combined)", y = "Proportion of oak, sycamore and birch trees (combined)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Again as expected,
absolute number of oak, brich and sycamore trees (combined)
and proportion of oak, sycamore and birch trees (combined)
are also positively correlated.
library(vegan)
## Cargando paquete requerido: permute
## Cargando paquete requerido: lattice
trees <- trees %>% relocate(c(conifer, other), .after = yew)
# I will be using the Simpson's Index to calculate tree diversity, although I could use other index.
tree.simp <- diversity(trees[,2:24],index = "simpson") # I'm excluding the columns "other" and "conifers" because I don't exactly know what taxa they comprise and whether said taxa (or some of it) are already present in other columns (f.e. pine trees)
environment$tree_diversity_simpson <- tree.simp
plot5 <- ggplot(environment, aes(x=sites, y=tree_diversity_simpson)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Tree diversity") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot5
The picture offered by tree diversity looks different to the ones
shown so far (if at all, it share more similarities with the
Proportion of oak, birch and sycamore trees combined):
grid.arrange(plot2, plot4, plot5, ncol=1, nrow = 3)
lm6 <- lm(tree_diversity_simpson ~ proportion_oak_birch_sycamore, data = environment)
summary(lm6)
##
## Call:
## lm(formula = tree_diversity_simpson ~ proportion_oak_birch_sycamore,
## data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49187 -0.07106 0.04665 0.12555 0.25113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.85459 0.06312 13.540 < 2e-16 ***
## proportion_oak_birch_sycamore -0.48437 0.10160 -4.767 2.26e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1765 on 42 degrees of freedom
## Multiple R-squared: 0.3511, Adjusted R-squared: 0.3357
## F-statistic: 22.73 on 1 and 42 DF, p-value: 2.258e-05
ggplot(environment, aes(x=proportion_oak_birch_sycamore, y=tree_diversity_simpson)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Proportion of oak, sycamore and birch trees (combined)", y = "Tree diversity") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Interestingly enough, as the proportion of oak, sycamore and birch trees (combined) increases, tree diversity decreases. This is expected as some sites are dominated by either of these geni (e.g. PIT is overwhelmingly dominated by oak trees). Now the question would be whether tree diversity is not really positive if oak, birch and sycamore abundance is very low (as these are the trees that have the most positive impact in breeding traits).
lm7 <- lm(tree_diversity_simpson ~ elevation, data = environment)
summary(lm7)
##
## Call:
## lm(formula = tree_diversity_simpson ~ elevation, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53211 -0.10693 0.04404 0.12837 0.33838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7160474 0.0492103 14.551 < 2e-16 ***
## elevation -0.0008801 0.0002593 -3.395 0.00151 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.194 on 42 degrees of freedom
## Multiple R-squared: 0.2153, Adjusted R-squared: 0.1966
## F-statistic: 11.52 on 1 and 42 DF, p-value: 0.001511
ggplot(environment, aes(x=elevation, y=tree_diversity_simpson)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Elevation", y = "Tree diversity") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Tree diversity decreases as elevation increases.
lm8 <- lm(site_occ ~ tree_diversity_simpson, data = environment)
summary(lm8)
##
## Call:
## lm(formula = site_occ ~ tree_diversity_simpson, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4189 -0.1859 0.0064 0.1459 0.3975
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.33436 0.09231 3.622 0.000782 ***
## tree_diversity_simpson 0.27089 0.14893 1.819 0.076065 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2114 on 42 degrees of freedom
## Multiple R-squared: 0.07302, Adjusted R-squared: 0.05095
## F-statistic: 3.308 on 1 and 42 DF, p-value: 0.07606
ggplot(environment, aes(x=tree_diversity_simpson, y=site_occ)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Tree diversity", y = "Nest occupancy") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Although the points are quite scattered (the model fit is not great; R^2 = 0.073018), there seems to be a positive correlation between tree diversity and average nest box occupancy: in sites were tree composition is more varied, nest boxes are usually occupied. As we don’t know the relation between either of these variables and any of the breeding traits, we cannot infer much more (in regard to breeding success).
Now, I’ll use foliage scores. Unfortunately, I haven’t been able to calculate them myself (I must be misunderstanding how Shutt et al. (2018) did the calculation), therefore I’m using the values found in the Habitat_Site.csv:
habitat_foliage_scores <- read.csv("C:/Users/julia/OneDrive - University of Edinburgh/EEB_MSc_UEd/Dissertation/diss/data/Habitat_SiteII.csv")
as_tibble(habitat_foliage_scores)
## # A tibble: 44 × 24
## Site Alder_FS Ash_FS Aspen_FS Beech_FS Birch_FS Elm_FS Oak_FS Sycamore_FS
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALN 6.5 0 0 0 67.3 0 0 0
## 2 ART 0 0.75 0 3.59 8.5 0 50.1 0
## 3 AVI 0 0 0 0 37.0 0 0 0
## 4 AVN 0 0 7.03 0 2.78 0 69.8 0
## 5 BAD 0 6.04 0 17.1 9.54 0 0 9.5
## 6 BIR 1.41 1.41 0 0.125 12.0 0 28.3 9.34
## 7 BLA 2.97 5.59 0.906 16.5 20.4 0.125 0 0
## 8 BLG 0.781 14.2 0 0.125 4.94 4.97 20.6 31.9
## 9 CAL 3.28 0 0 0 35.0 0 0 0.125
## 10 CAR 0 0 20.0 0 27.1 0 0 0
## # ℹ 34 more rows
## # ℹ 15 more variables: Willow_FS <dbl>, Conifer_FS <dbl>, OthDecid_FS <dbl>,
## # Total <dbl>, Alder_prop <dbl>, Ash_prop <dbl>, Aspen_prop <dbl>,
## # Beech_prop <dbl>, Birch_prop <dbl>, Elm_prop <dbl>, Oak_prop <dbl>,
## # Sycamore_prop <dbl>, Willow_prop <dbl>, Conifer_prop <dbl>,
## # OthDecid_prop <dbl>
First, I would like to recreate Figure 2 in Shutt et al. (2018) were they represent different foliage scores per sites:
habitat_foliage_scores <- habitat_foliage_scores %>% arrange(factor(Site, levels = level_order))
fs <- habitat_foliage_scores[,1:12] # selecting columns with absolute foliage scores
score <- fs %>% pivot_longer(cols=c(Alder_FS, Ash_FS, Aspen_FS, Beech_FS, Birch_FS, Elm_FS, Oak_FS, Sycamore_FS, Willow_FS, Conifer_FS, OthDecid_FS), names_to = "foliage_score")
ggplot(score, aes(x=Site, y=value, fill = foliage_score)) +
geom_col(colour="black") +
theme_bw() +
labs(x = "Sites", y = "Foliage score") +
guides(fill=guide_legend(title="Foliage score")) +
scale_fill_hue(labels = c("Alder", "Ash", "Aspen", "Beech", "Birch", "Conifer", "Elm", "Oak", "Other deciduous", "Sycamore", "Willow")) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
This plot serves to show the high habitat variability along the transect of the project. We can appreciate sites where foliage cover is dominated by one or two species (e.g. EDI, TOM,PIT) and sites were foliage cover is more or less evenly spread among several tree geni (e.g. DEL).
Is this wide variety also represented by solely counting trees?:
treex <- trees[,1:26]
treex <- treex %>% mutate(other_dec = blackthorn + cherry + chestnut + elder + hawthorn + hazel + holly + lime + rose + rowan + whitebeam + other, conifers = juniper + pine + yew + conifer) %>%
select(sites, alder, ash, aspen, beech, birch, conifers, elm, oak, sycamore, willow, other_dec)
treex_long <- treex %>% pivot_longer(cols=c(alder, ash, aspen, beech, birch, conifers, elm, oak, sycamore, willow, other_dec), names_to = "taxon")
ggplot(treex_long, aes(x=sites, y=value, fill = taxon)) +
geom_col(colour="black") +
theme_bw() +
labs(x = "Sites", y = "Number of trees") +
guides(fill=guide_legend(title="Taxa")) +
scale_fill_hue(labels = c("Alder", "Ash", "Aspen", "Beech", "Birch", "Conifers", "Elm", "Oak", "Other deciduous", "Sycamore", "Willow")) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
From a first glance to both plots, I’d say that (as should be expected) using count may either overestimate the effect of certain trees in a site (e.g. in BIR there appear to be >50 sycamore trees but their cover appears to be quite low, according to the foliage score) or underestimate it (e.g. could apply to sycamore in EDI). However, it’s not easy to tell as in both cases we’re evaluating absolute numbers, and it might not be proper to compare absolute foliage score with absolute number of trees. If instead we compare contribution of each taxa to tree composition according to proportion of trees and to proportion of foliage score we might get a better idea:
fs_prop <- habitat_foliage_scores[,c(1, 14:24)] # selecting columns with absolute foliage scores
score_prop <- fs_prop %>% pivot_longer(cols=c(Alder_prop, Ash_prop, Aspen_prop, Beech_prop, Birch_prop, Elm_prop, Oak_prop, Sycamore_prop, Willow_prop, Conifer_prop, OthDecid_prop), names_to = "prop_foliage_score")
plot6 <- ggplot(score_prop, aes(x=Site, y=value, fill = prop_foliage_score)) +
geom_col(colour="black") +
theme_bw() +
labs(x = "Sites", y = "Proportional oliage score") +
guides(fill=guide_legend(title="Foliage score")) +
scale_fill_hue(labels = c("Alder", "Ash", "Aspen", "Beech", "Birch", "Conifer", "Elm", "Oak", "Other deciduous", "Sycamore", "Willow")) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
treex_prop <- treex %>%
mutate(total = alder + ash + aspen + beech + birch + conifers + elm + oak + other_dec + sycamore + willow) %>%
mutate(alder_prop = alder/total, ash_prop = ash/total, aspen_prop = aspen/total, beech_prop = beech/total, birch_prop = birch/total, conifers_prop = conifers/total, elm_prop = elm/total, oak_prop = oak/total, other_dec_prop = other_dec/total, sycamore_prop = sycamore/total, willow_prop = willow/total) %>%
select(sites, alder_prop, ash_prop, aspen_prop, beech_prop, birch_prop, conifers_prop, elm_prop, oak_prop, other_dec_prop, sycamore_prop, willow_prop)
treex_prop_long <- treex_prop %>% pivot_longer(cols=c(alder_prop, ash_prop, aspen_prop, beech_prop, birch_prop, conifers_prop, elm_prop, oak_prop, other_dec_prop, sycamore_prop, willow_prop), names_to = "prop")
plot7 <- ggplot(treex_prop_long, aes(x=sites, y=value, fill = prop)) +
geom_col(colour="black") +
theme_bw() +
labs(x = "Sites", y = "Proportion of each taxa contribution") +
guides(fill=guide_legend(title="Foliage score")) +
scale_fill_hue(labels = c("Alder", "Ash", "Aspen", "Beech", "Birch", "Conifer", "Elm", "Oak", "Other deciduous", "Sycamore", "Willow")) +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
grid.arrange(plot6, plot7, ncol=1, nrow = 2)
There are slight differences between plots. However, since the foliage score was specifically calculated to capture habitat more extensively, it’s probably a better representation of the environment. I would like to see correlation between both but I might explore that more in depth later.
environment$total_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site), "Total"]
plot8 <- ggplot(environment, aes(x=sites, y=total_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Total foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot8
There appears to be quite a lot of variation among sites in terms of foliage score.
lm9 <- lm(total_FS ~ elevation, data = environment)
summary(lm9)
##
## Call:
## lm(formula = total_FS ~ elevation, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.01 -11.21 -4.18 13.09 52.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.16502 4.90875 18.572 < 2e-16 ***
## elevation -0.14121 0.02586 -5.461 2.36e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.36 on 42 degrees of freedom
## Multiple R-squared: 0.4152, Adjusted R-squared: 0.4013
## F-statistic: 29.82 on 1 and 42 DF, p-value: 2.361e-06
ggplot(environment, aes(x=elevation, y=total_FS)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Elevation", y = "Total foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
As elevation increases, total foliage score decreases, as does tree diversity. However, even if tree diversity and foliage score were positively correlated, I can’t see a direct link as foliage cover is great in several sites dominated by one or a couple taxa (e.g. site SLS).
environment$oak_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site),"Oak_FS"]
plot9 <- ggplot(environment, aes(x=sites, y=oak_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oak foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
plot9
This plot looks particularly similar to
Absolute number of oaks:
lm10 <- lm(oak_FS ~ TOTAL_oak, data = environment)
summary(lm10)
##
## Call:
## lm(formula = oak_FS ~ TOTAL_oak, data = environment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7770 -0.6696 0.0834 0.2424 26.1640
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08340 1.15759 -0.072 0.943
## TOTAL_oak 0.84577 0.03505 24.129 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.479 on 42 degrees of freedom
## Multiple R-squared: 0.9327, Adjusted R-squared: 0.9311
## F-statistic: 582.2 on 1 and 42 DF, p-value: < 2.2e-16
ggplot(environment, aes(x=TOTAL_oak, y=oak_FS)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
theme_bw() +
labs(x = "Absolute number of oaks", y = "Oak foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(margin = unit(c(1, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm")))
Both variables are correlated, but not 1-to-1 (although model fit is quite good: R^2 = 0.9327143).
environment$birch_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site),"Birch_FS"]
plot10 <- ggplot(environment, aes(x=sites, y=oak_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oak foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
environment$sycamore_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site),"Sycamore_FS"]
plot11 <- ggplot(environment, aes(x=sites, y=sycamore_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Sycamore foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
environment$willow_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site),"Willow_FS"]
plot12 <- ggplot(environment, aes(x=sites, y=willow_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Willow foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
environment$beech_FS <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site),"Beech_FS"]
plot13 <- ggplot(environment, aes(x=sites, y=beech_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Beech foliage score") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
oak_birch_sycamore_FS <- habitat_foliage_scores %>%
select(c(Site,Oak_FS,Birch_FS,Sycamore_FS)) %>%
mutate(sum = Oak_FS + Birch_FS, Sycamore_FS)
environment$oak_birch_sycamore_FS <- oak_birch_sycamore_FS[match(environment$sites, oak_birch_sycamore_FS$Site),"sum"]
plot14 <- ggplot(environment, aes(x=sites, y=oak_birch_sycamore_FS)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oak, birch and sycamore foliage scores combined") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
environment$oak_FS_prop <- habitat_foliage_scores[match(environment$sites, habitat_foliage_scores$Site), "Oak_prop"]
plot15 <- ggplot(environment, aes(x=sites, y=oak_FS_prop)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oak foliage score (proportion)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
oak_birch_sycamore_FS_prop <- habitat_foliage_scores %>%
select(c(Site,Oak_prop,Birch_prop,Sycamore_prop)) %>%
mutate(sum = Oak_prop + Birch_prop, Sycamore_prop)
environment$oak_birch_sycamore_FS_prop <- oak_birch_sycamore_FS_prop[match(environment$sites, oak_birch_sycamore_FS_prop$Site),"sum"]
plot16 <- ggplot(environment, aes(x=sites, y=oak_birch_sycamore_FS_prop)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Oak, birch and sycamore foliage score (proportion and combined)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order)
grid.arrange(plot8, plot9, plot11, plot12, plot13, plot14, plot15, plot16, ncol=2, nrow = 4)
Seeing these graphs, my first impression is that individual foliage scores might have the drawback of being a 0 for many sites whose average breeding performance might be quite different from one to another. However, combining them in a single model (as Shutt. et al (2018) did) could compensate for this and would potentially give us more detail information on what specific elements in the habitat enhance or decrease breeding success (as well as it give us the opportunity to compare results with those published by Shutt et al.(2018)). Even though they don’t include beech in their study, I was wondering wether it would be interesting to as, according to Ally, birds seem to do worse when there’s lots of beech around. I wonder if it could act as a “bad environment” and whether addressing it specifically would be useful.
As Sanjana suggested, I looked at the Scottish Government Urban Rural Classification and I have assigned to each site one category based on where they are in the map and which category is assigned to their area. Ideally I would assign them more precisely (f.e. using their postcode) but I am struggling to find the postcode indices for each year of the project and their respective Urban Rural Classification Category, so for now I have simply assigned them visually (I attached maps to the e-mail). What I’ve noticed by doing this is that there are a couple of sites (e.g. PTH, RSY) that are very, very close to Large Urban areas (or Other Urban Areas or Rest of Scotland, depending on the map and the classification system) but they are not within one. I would agree that, even if they are not within the borders of such areas, they probably are affected to some degree by them, but I don’t think that will be captured as usually they fall within the same category as other sites that are quite far away from any considerable urban area. I am very interested in characterising urbanisation and this is a good start, although I wonder whether measuring distance from each site to the closest Large Urban or Other Urban Area would be a more precise approach.
This being said, I have classified all sites according to all the Urban Rural Classification Systems (2-fold, 3-fold, 6-fold and 8-fold):
library(openxlsx)
urbanisation <- read.xlsx("C:/Users/julia/OneDrive - University of Edinburgh/EEB_MSc_UEd/Dissertation/diss/data/Urban_Rural_Classification_new.xlsx")
head(urbanisation)
## site mean_latitude mean_longitude year X2fold X3fold X6fold X8fold
## 1 EDI 55.97700 -3.414070 2014 2 2 5 6
## 2 RSY 56.02237 -3.413130 2014 2 2 5 6
## 3 FOF 56.05684 -3.382320 2014 2 2 5 6
## 4 BAD 56.12171 -3.445130 2014 2 2 5 6
## 5 LVN 56.17460 -3.355220 2014 2 2 5 6
## 6 DOW 56.16253 -3.423004 2014 2 2 5 6
ggplot(urbanisation, aes(x=site, y=X2fold)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Urbanisation (2-fold system)") +
theme(axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
facet_wrap(~year) # this is probably not the best plot to use, but at the moment I couldn't code what I would like to see. Anyhow, I will provide the maps
ggplot(urbanisation, aes(x=site, y=X3fold)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Urbanisation (3-fold system)") +
theme(axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
facet_wrap(~year)
ggplot(urbanisation, aes(x=site, y=X6fold)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Urbanisation (6-fold system)") +
theme(axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
facet_wrap(~year)
ggplot(urbanisation, aes(x=site, y=X8fold)) +
geom_point() +
theme_bw() +
labs(x = "Sites", y = "Urbanisation (8-fold system)") +
theme(axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm")),
axis.title.y = element_text(margin = unit(c(0, 5, 0, 0), "mm"))) +
scale_x_discrete(limits = level_order) +
facet_wrap(~year)
In almost all classifications, all site fall within areas classified as rural (and depending on the classfication, bigger or smaller and more or less remote).